""" MultiQC module to parse output from the Seqera Platform CLI """

import datetime as dt
import json
import logging
import os
import re
import tarfile
from collections import defaultdict

import humanize

from multiqc.modules.base_module import BaseMultiqcModule, ModuleNoSamplesFound
from multiqc.plots import bargraph
from multiqc.utils import mqc_colour

log = logging.getLogger(__name__)


def _read_json_from_tar_gz(tar_file, fname):
    try:
        fh = tar_file.extractfile(tar_file.getmember(fname))
        contents = fh.read()
    except Exception as e:
        log.warning(f"Could not extract file {fname} from archive {tar_file}: {e}")
        return {}
    try:
        data = json.loads(contents)
    except Exception as e:
        log.warning(f"Could parse JSON from {fname} in {tar_file}: {e}")
        return {}
    return data


class MultiqcModule(BaseMultiqcModule):
    """
    Seqera Platform CLI module for MultiQC. Reports stats from log dumps
    usually written in a form of a tar-gz archive, but also their uncompressed
    versions (that is, workflow.json and workflow-load.json files).
    To allow reading the tar-gz archives, run with `ignore_images: false`
    in the config, e.g.:
    ```
    multiqc . --cl-config 'ignore_images: false'
    ```
    """

    # Matches the tail of a Seqera Platform run URL:
    # .../orgs/<org>/workspaces/<workspace>/watch/<run-id>
    # Compiled once here instead of being recompiled inside per-run loops.
    RUN_URL_RE = re.compile(r"\/orgs\/([^\/]+)\/workspaces\/([^\/]+)\/watch\/([^\/]+)\/?$")

    # Optional companion files that may sit next to workflow.json; their
    # contents are merged into the same run record.
    EXTRA_JSON_FILES = ("workflow-load.json", "service-info.json", "workflow-metadata.json")

    def __init__(self):
        super().__init__(
            name="Seqera Platform CLI",
            anchor="seqera_cli",
            href="https://github.com/seqeralabs/tower-cli",
            info="reports statistics generated by the Seqera Platform CLI.",
            # doi="",  # No DOI for this tool
        )

        # One dict of parsed fields per run, keyed by "<repo>_<run-id>"
        data_by_run = defaultdict(dict)

        # Parsing the tar-gz dump
        for f in self.find_log_files("seqera_cli/run_dump", filecontents=False):
            with tarfile.open(os.path.join(f["root"], f["fn"])) as tar_file:
                # Read the member list once per archive instead of once per lookup
                names = tar_file.getnames()
                if "workflow.json" not in names:
                    continue
                d = _read_json_from_tar_gz(tar_file, "workflow.json")

                # Check other files that sit next to workflow.json
                for extra_fname in self.EXTRA_JSON_FILES:
                    if extra_fname in names:
                        d.update(_read_json_from_tar_gz(tar_file, extra_fname))

                d = self._parse_data(d)
                if d:
                    self.add_data_source(f, s_name=d["id_repository"])
                    data_by_run[d["id_repository"]].update(d)

        # Parsing the json files directly
        for f in self.find_log_files("seqera_cli/json"):
            d = json.loads(f["f"])
            if not d:
                continue

            # Check other files that sit next to workflow.json
            load_path = os.path.join(f["root"], "workflow-load.json")
            if os.path.isfile(load_path):
                with open(load_path) as fh:
                    d.update(json.load(fh))

            service_info_path = os.path.join(f["root"], "service-info.json")
            if os.path.isfile(service_info_path):
                with open(service_info_path) as fh:
                    d.update(json.load(fh))

            workflow_metadata_path = os.path.join(f["root"], "workflow-metadata.json")
            if os.path.isfile(workflow_metadata_path):
                with open(workflow_metadata_path) as fh:
                    d.update(json.load(fh))

            d = self._parse_data(d)
            if d:
                self.add_data_source(f, s_name=d["id_repository"])
                data_by_run[d["id_repository"]].update(d)
                if os.path.isfile(load_path):
                    self.add_data_source(source=load_path, s_name=d["id_repository"])
                if os.path.isfile(service_info_path):
                    self.add_data_source(source=service_info_path, s_name=d["id_repository"])

        # Figure out the org/workspace and save as a separate field
        # Needed so that we can have it as a separate column in the table
        for d in data_by_run.values():
            if d.get("runUrl"):
                m = self.RUN_URL_RE.search(d["runUrl"])
                if m:
                    org, workspace, _run = m.groups()
                    d["org"] = org
                    d["workspace"] = workspace
                    d["org_workspace"] = f"{org}/{workspace}"

        # Filter to strip out ignored sample names
        data_by_run = self.ignore_samples(data_by_run)
        if len(data_by_run) == 0:
            raise ModuleNoSamplesFound
        log.info(f"Found {len(data_by_run)} reports")

        # Write parsed report data to a file
        self.write_data_file(data_by_run, "multiqc_seqera_cli")

        self._table(data_by_run)

        self._plots(data_by_run)

    def _parse_data(self, d):
        """
        Reduce a raw merged JSON record to the fields we report on, derive the
        sample name ("<repo>_<run-id>"), convert ISO timestamps to UNIX
        timestamps plus a wall-time duration, and register software versions.

        Returns the cleaned dict, or None if the record has no "id" or
        "repository" (in which case it cannot be named and is skipped).
        """
        keys = [
            # workflow.json
            "id",
            "runUrl",
            "repository",
            "start",
            "complete",
            "revision",
            "nextflow",
            # workflow-load.json
            "cpuEfficiency",
            "memoryEfficiency",
            "cpuTime",
            "readBytes",
            "writeBytes",
            "cost",
            "pending",
            "submitted",
            "running",
            "succeeded",
            "failed",
            "cached",
            # service-info.json
            "version",
        ]
        d = {k: d.get(k) for k in keys if k in d}

        if not d.get("id") or not d.get("repository"):
            return None

        repo = d["repository"]
        repo = repo.replace("https://", "").replace("http://", "").replace("github.com/", "")
        d["id_repository"] = f"{repo}_{d['id']}"

        # "start" and "complete" are timestamps like 2023-10-22T14:39:01Z.
        # Parse both, take the difference "complete" - "start" as the wall time
        # (seconds), and store both ends as UNIX timestamps. Both values are
        # guarded against None/empty: a still-running workflow has no
        # "complete", and a malformed record could carry a null "start"
        # (which previously would have crashed strptime).
        if d.get("start") and d.get("complete"):
            start = dt.datetime.strptime(d["start"], "%Y-%m-%dT%H:%M:%SZ")
            complete = dt.datetime.strptime(d["complete"], "%Y-%m-%dT%H:%M:%SZ")
            wall_time = complete - start
            d["wallTime"] = wall_time.total_seconds()
            d["start"] = start.timestamp()
            d["complete"] = complete.timestamp()

        version = d.get("version")
        if version:
            d["seqeraVersion"] = version
            self.add_software_version(version, sample=d["id_repository"], software_name="Seqera Platform")

        # "nextflow" can be present but null, hence the `or {}` guard
        # (a plain d.get("nextflow", {}) would return None in that case).
        nextflow_version = (d.get("nextflow") or {}).get("version")
        if nextflow_version:
            d["nextflowVersion"] = nextflow_version
            self.add_software_version(nextflow_version, sample=d["id_repository"], software_name="Nextflow")

        return d

    def _table(self, data_by_run):
        """
        Add the main table with run statistics.
        """
        # Collecting categorical values into distinct lists that we want to color code
        # with badges and backgrounds. Runs with an unknown version would add a
        # None entry, producing a meaningless colour/formatting rule, so None is
        # dropped from both sets.
        seqera_versions = list({d.get("seqeraVersion") for d in data_by_run.values()} - {None})
        nextflow_versions = list({d.get("nextflowVersion") for d in data_by_run.values()} - {None})
        scale = mqc_colour.mqc_colour_scale("Dark2")
        version_colors = [
            {v: scale.get_colour(i, lighten=0.5)} for i, v in enumerate(seqera_versions + nextflow_versions)
        ]
        repositories = list(set(d.get("repository") for d in data_by_run.values()))

        def format_run_url(x):
            # Render the run URL as a link labelled with just the run ID;
            # fall back to the raw value when the URL doesn't match.
            if x:
                m = MultiqcModule.RUN_URL_RE.search(x)
                if m:
                    org, workspace, run = m.groups()
                    return f'<a href="{x}" style="white-space: nowrap;" target="_blank">{run}</a>'
            return str(x)

        headers = {
            "runUrl": {
                "title": "Run ID",
                "description": "Workflow run ID",
                "scale": False,
                "format": format_run_url,
            },
            "org_workspace": {
                "title": "Workspace",
                "description": "Organisation and workspace",
                "scale": False,
            },
            "repository": {
                "title": "Repository",
                "description": "Name of the repository",
                "scale": "Accent",
                # Map the repo to its index so the colour scale gets a number,
                # then format the index back into a link with a shortened label.
                "modify": lambda x: repositories.index(x),
                "format": lambda x: f'<a href="{repositories[x]}" style="white-space: nowrap;">{repositories[x].replace("https://", "").replace("http://", "").replace("github.com/", "")}</a>',
            },
            "revision": {
                "title": "Version",
                "description": "Pipeline version",
                "scale": False,
            },
            "start": {
                "title": "Start",
                "description": "Start time of the workflow",
                "hidden": True,
                "format": lambda x: humanize.naturaltime(dt.datetime.fromtimestamp(x)).replace(" ", "&nbsp;"),
            },
            "complete": {
                "title": "Complete",
                "description": "End time of the workflow",
                "hidden": True,
                "format": lambda x: humanize.naturaltime(dt.datetime.fromtimestamp(x)).replace(" ", "&nbsp;"),
            },
            "wallTime": {
                "title": "Wall time",
                "description": "Duration of the workflow",
                "format": lambda x: str(dt.timedelta(seconds=x)).replace(" ", "&nbsp;"),
                "scale": "BuPu",
            },
            "cpuTime": {
                "title": "CPU time",
                "description": "Total CPU time used by the workflow",
                "modify": lambda x: x // 1000 / 60 / 60,  # milliseconds -> hours
                "suffix": "&nbsp;h",
                "scale": "Greys",
            },
            "cost": {
                "title": "Est. cost",
                "description": "Estimated cost of the workflow",
                "format": "${:,.2f}",
                "scale": "Reds",
            },
            "readBytes": {
                "title": "Read GB",
                "description": "Total gigabytes read by the workflow",
                "format": lambda x: humanize.naturalsize(x),
                "scale": "Blues",
            },
            "writeBytes": {
                "title": "Write GB",
                "description": "Total gigabytes written by the workflow",
                "format": lambda x: humanize.naturalsize(x),
                "scale": "Greens",
            },
            "cpuEfficiency": {
                "title": "CPU efficiency",
                "description": "Percentage of CPU time used by the workflow",
                "format": "{:,.2f}",
                "suffix": "&nbsp;%",
                "max": 100,
                "scale": "RdYlGn",
            },
            "memoryEfficiency": {
                "title": "Memory efficiency",
                "description": "Percentage of memory used by the workflow",
                "format": "{:,.2f}",
                "suffix": "&nbsp;%",
                "max": 100,
                "scale": "YlGn",
            },
            "nextflowVersion": {
                "title": "Nextflow",
                "description": "Version of Nextflow",
                "cond_formatting_colours": version_colors,
                "cond_formatting_rules": {v: [{"s_eq": v}] for v in nextflow_versions},
                "scale": False,
            },
        }
        self.general_stats_addcols(data_by_run, headers)

    def _plots(self, data_by_run):
        """
        Add bar plots: per-run task status counts, wall time, CPU time and
        estimated cost.
        """
        self.add_section(
            name="Workflow status",
            anchor="seqera_cli_process_statuses_section",
            plot=bargraph.plot(
                data_by_run,
                cats={
                    "pending": {"name": "Pending", "color": "#8f4199"},
                    "submitted": {"name": "Submitted", "color": "#e68642"},
                    "running": {"name": "Running", "color": "#4256e7"},
                    "cached": {"name": "Cached", "color": "#939598"},
                    "succeeded": {"name": "Succeeded", "color": "#28ae61"},
                    "failed": {"name": "Failed", "color": "#e7363e"},
                },
                pconfig={
                    "id": "seqera_cli_process_statuses_plot",
                    "title": "Seqera platform CLI: Workflow status",
                    "ylab": "Number of tasks",
                },
            ),
        )

        plot_data = dict()
        for sn, data in data_by_run.items():
            plot_data[sn] = dict()
            if "wallTime" in data:
                plot_data[sn]["wallTime"] = data["wallTime"] / 60 / 60  # hours
            # NOTE: these two conditions were previously written as
            # `if "cpuTime" in data is not None:` — an accidental chained
            # comparison that only evaluated correctly by luck.
            if "cpuTime" in data:
                plot_data[sn]["cpuTime"] = data["cpuTime"] // 1000 / 60 / 60  # hours
            if "cost" in data:
                plot_data[sn]["cost"] = data["cost"]

        self.add_section(
            name="Wall time",
            anchor="seqera_cli_wall_time_section",
            plot=bargraph.plot(
                plot_data,
                {"wallTime": {"name": "Wall time"}},
                {
                    "id": "seqera_cli_wall_time_plot",
                    "title": "Seqera platform CLI: Wall time",
                    "ylab": "hours",
                    "tt_decimals": 1,
                    "tt_suffix": "&nbsp;h",
                    "tt_percentages": False,
                    "cpswitch": False,
                    "hide_zero_cats": False,
                },
            ),
        )
        self.add_section(
            name="CPU time",
            anchor="seqera_cli_cpu_time_section",
            plot=bargraph.plot(
                plot_data,
                {"cpuTime": {"name": "CPU time"}},
                {
                    "id": "seqera_cli_cpu_time_plot",
                    "title": "Seqera platform CLI: CPU time",
                    "ylab": "CPU hours",
                    "tt_decimals": 1,
                    "tt_suffix": "&nbsp;h",
                    "tt_percentages": False,
                    "cpswitch": False,
                    "hide_zero_cats": False,
                },
            ),
        )
        self.add_section(
            name="Estimated cost",
            anchor="seqera_cli_cost_section",
            plot=bargraph.plot(
                plot_data,
                {"cost": {"name": "Estimated cost"}},
                {
                    "id": "seqera_cli_cost_plot",
                    "title": "Seqera platform CLI: Estimated cost",
                    "ylab": "$",
                    "tt_decimals": 1,
                    "tt_suffix": "&nbsp;$",
                    "tt_percentages": False,
                    "cpswitch": False,
                    "hide_zero_cats": False,
                },
            ),
        )